# Purpose: Generate average topic-word probabilities for each model
#
# NOTE: the script deliberately does not clear the workspace with
# rm(list = ls()). Every object it uses is defined below, and wiping the
# global environment would destroy unrelated objects whenever the script is
# source()d from an interactive session.

### PARAMETERS ########################
# number of columns in every topic's e-log-prob matrix
# (presumably one column per year -- confirm against the model output)
N.years <- 79
#######################################

### PATHS ##############################
# directory that holds the per-model output folders
topic.probs <- "./data"
# table of word labels; its columns are named word, freq and id below
word.id.file <- "./data/word_ids.dat"
# directory the averaged probabilities are written to
output.file.path <- "./data"
#######################################

# open file with word labels
if (!file.exists(word.id.file)) {
    stop("Word id file not found: ", word.id.file, call. = FALSE)
}
words <- read.table(word.id.file, stringsAsFactors=FALSE)

# fail with a clear message instead of the generic 'names' length error
if (ncol(words) < 3L) {
    stop("Expected at least 3 columns (word, freq, id) in ", word.id.file,
         " but found ", ncol(words), call. = FALSE)
}
names(words) <- c("word", "freq", "id")

# vocabulary size; every topic file must supply this many rows
no.words <- nrow(words)

# Alphabetical ordering of the vocabulary. It is the same for every topic and
# every model, so it is computed once instead of once per topic file.
word.order <- order(words$word)
sorted.words <- words$word[word.order]

# number of values every topic file has to contain (one row per word)
expected.values <- no.words * N.years

# one pass per model size: models with 10, 11, ..., 40 topics
for (topic.model in seq(10,40,1)) {

    cat(paste("Generating topic-word probabilities for model",topic.model,"\n"))

    # empty matrix to hold results: one row per word (alphabetical order),
    # one column per topic of the current model
    avg.probs <- matrix(NA_real_, nrow=no.words, ncol=topic.model)
    rownames(avg.probs) <- sorted.words

    # loop over e-log files to extract probabilities for each topic;
    # the files are numbered from 0, the result columns from 1
    for (i in seq_len(topic.model) - 1L) {
        cat(paste("Computing average probs for topic", i+1,"\n"))

        # generate topic number in file name (zero-padded to three digits)
        topicIn <- sprintf("%03d", i)
        topicOut <- i+1

        # generate file name
        fname <- file.path(topic.probs, paste0(topic.model,"-topics_100_iterations_dtm_output/lda-seq/topic-", topicIn, "-var-e-log-prob.dat"))

        # open file; refuse input of the wrong size, because matrix() would
        # otherwise recycle values with nothing more than a warning
        e.log.values <- scan(fname)
        if (length(e.log.values) != expected.values) {
            stop("Unexpected number of values in ", fname, ": expected ",
                 expected.values, " (", no.words, " words x ", N.years,
                 " columns) but found ", length(e.log.values), call. = FALSE)
        }

        # generate matrix with probabilities: the file stores logs, row by row
        e.logs <- matrix(e.log.values, ncol=N.years, byrow=TRUE)
        probs <- exp(e.logs)

        # add average word probabilities to matrix, in alphabetical word order
        avg.probs[ , topicOut] <- rowMeans(probs)[word.order]

    }

    # write results to output file; the name carries the model size
    # (topic.model), not a variable left over from the inner loop.
    # NOTE(review): the file has a .csv extension but write.table() uses its
    # default separator (a space); kept as is for downstream readers.
    fname <- file.path(output.file.path, paste0("avg_probabilities_topic_",topic.model,".csv"))
    write.table(avg.probs, file=fname, row.names=TRUE, col.names=FALSE)


}

